####################################################
#Author: Kelli Marquardt
#Purpose: Simulate Fake doctor level dataset for replication package, Mis(sed) Diagnosis: Physician Decision Making and ADHD 

# Inputs:
# data/est_dat_fake.csv

# Outputs:
#- data/doc_dat_fake.csv

####################################################

############################
#0 load required packages
############################
rm(list = ls(all.names = TRUE))

required.packages = c("dplyr", "MASS")

missing=setdiff(required.packages, rownames(installed.packages()))

if (length(missing) > 0) {
  message("Installing missing packages: ", paste(missing, collapse = ", "))
  install.packages(missing, dependencies = TRUE)
} else {
  message("All required packages are already installed.")
}
rm(missing, required.packages)

#load packages
library(MASS)
library(dplyr) 


####################################
# set seed for replication
seed_for_fake_data=12345
set.seed(seed_for_fake_data+100)
####################################


######################################
#step 1: read in the fake est_dat and pull first_pcp_id and doc_id 
######################################

est_dat=read.csv("data/est_dat_fake.csv", stringsAsFactors = F)
first_pcp_id=na.omit(unique(est_dat$first_pcp_id))
diag_doc_id=na.omit(unique(est_dat$diag_doc))

######################################
#step 2: get SIP docs given the overlap in table B1
######################################
n_sip=230
#there are ~230 SIPs (14% are IPCP and 70% are DP)
sip_ids = sprintf("%05d", sample(0:99999, size = (n_sip), replace = FALSE))

#check are any in ipcp or dp? A: No
sum(sip_ids %in% first_pcp_id)
sum(sip_ids %in% diag_doc_id)

#need 14% to be an IPCP and 70% to be DP 
sip_ipcp = min(length(first_pcp_id), floor(.14*n_sip))
sip_dp = min(length(diag_doc_id), floor(.70*n_sip))

sip_ipcp=sample(first_pcp_id, sip_ipcp, replace=F)
sip_dp=sample(diag_doc_id, sip_dp, replace=F)

#replace first sip_ids with unique(c(sip_ipcp, sip_dp))
sip_ids[1:length(unique(c(sip_ipcp, sip_dp)))]=unique(c(sip_ipcp, sip_dp))

######################################
#step 3: put all together 
######################################
full_doc_list=unique(c(sip_ids, first_pcp_id, diag_doc_id))
full_doc_dat=as.data.frame(full_doc_list)
colnames(full_doc_dat)=c("doc_id")
full_doc_dat=full_doc_dat%>%
  mutate(SIP=as.integer(doc_id %in% sip_ids),
         IPCP=as.integer(doc_id %in% first_pcp_id),
         DP=as.integer(doc_id %in% diag_doc_id))

######################################
#step 4: assign random values for characteristics 
#for simplicity, follow means in column 1 
######################################
set.seed(seed_for_fake_data+200)
### ever_pcp
#ever_pcp=1 if IPCP=1 (and X to others such that mean is 0.52)
  #note 23% of sip are IPCP, so only need the difference 
n_sip_ipcp=nrow(full_doc_dat%>%filter(SIP==1 & IPCP==1))
n_sip_pcp_needed=floor(.52*n_sip)-n_sip_ipcp

full_doc_dat=full_doc_dat%>%
  mutate(ever_pcp=ifelse(IPCP==1, 1, 0))

id_fill=which(full_doc_dat$ever_pcp==0)

full_doc_dat$ever_pcp[id_fill[1:n_sip_pcp_needed]] =1


#build the different id sets (already have first_pcp_id and diag_doc_id)
ipcp_ids=first_pcp_id
dp_ids=diag_doc_id

both_ids=intersect(ipcp_ids, dp_ids)
dp_only_ids=setdiff(dp_ids, both_ids)
ipcp_only_ids=setdiff(ipcp_ids, both_ids)


### credentials 
#.767 are MD/DO, .145 other, rest unknown 
full_doc_dat= full_doc_dat %>%
  mutate(credentials= sample( 
    x=c("md_do","other","unknown"),
    size=n(), 
    replace=T, 
    prob=c(.767,.145, .088)))

### specialty 
#.238 are psych, . 652 are general, rest are other or unknown 
full_doc_dat= full_doc_dat %>%
  mutate(specialty= sample( 
    x=c("psych","gen_med","other_unknown"),
    size=n(), 
    replace=T, 
    prob=c(.238,.652, .110)))

### experience  
#.568 are less than 5 yrs, .145 are more than 15 years, .141 are unknown, rest are between 5 and 15
full_doc_dat= full_doc_dat %>%
  mutate(experience= sample( 
    x=c("le5","ge15","unknown", "5to15"),
    size=n(), 
    replace=T, 
    prob=c(.568,.145,.141, .146)))

### education  
#.736 went to us med school, .106 unknown, rest went outside of us 
full_doc_dat= full_doc_dat %>%
  mutate(med_school= sample( 
    x=c("us_medschool","unknown","int_medschool"),
    size=n(), 
    replace=T, 
    prob=c(.736,.106,.158)))




#################################################################################
# save
#################################################################################

write.csv(full_doc_dat, "data/doc_dat_fake.csv", row.names = F)


# END OF SCRIPT